# --- Environment setup --------------------------------------------------
# warn = -2: any negative value silences ALL warnings for the whole
# session. NOTE(review): consider scoping this with suppressWarnings()
# instead of hiding every warning globally.
options(warn=-2)
library(pacman)
p_load(dplyr)
source('../lib/import.R')
#
# Common helper library built during this assignment (presumably provides
# import(), loadcsv(), feat(), target() and the plotting/model helpers
# used throughout this report).
import('../lib/common-lib.R')
## [1] "-> './reflection.R' script loadded successfuly!"
## [1] "-> './data-frame.R' script loadded successfuly!"
## [1] "-> './csv.R' script loadded successfuly!"
## [1] "-> './plot.R' script loadded successfuly!"
## [1] "-> './data-frame.R' script loadded successfuly!"
## [1] "-> './importance.R' script loadded successfuly!"
## [1] "-> './correlation.R' script loadded successfuly!"
## [1] "-> './pca.R' script loadded successfuly!"
## [1] "-> './set_split.R' script loadded successfuly!"
## [1] "-> './models.R' script loadded successfuly!"
## ---
## biotools version 4.1
## [1] "-> './data-frame.R' script loadded successfuly!"
## [1] "-> './test.R' script loadded successfuly!"
## [1] "-> './metrics.R' script loadded successfuly!"
## [1] "-> './balance.R' script loadded successfuly!"
## [1] "-> '../lib/common-lib.R' script loadded successfuly!"
#
# Helper functions specific to this assignment (TP).
import('../scripts/helper_functions.R')
## [1] "-> '../lib/pca.R' script loadded successfuly!"
## [1] "-> './data-frame.R' script loadded successfuly!"
## [1] "-> '../lib/importance.R' script loadded successfuly!"
## [1] "-> '../scripts/helper_functions.R' script loadded successfuly!"
set.seed(1)

1. Cargamos el dataset

1.1. Cargamos el dataset nasa-asteroids y excluimos las observaciones con valores faltantes.

# Load the NASA asteroids dataset (loadcsv() is a project helper).
dataset <- loadcsv('../datasets/nasa.csv')
str(dataset)
## 'data.frame':    4687 obs. of  40 variables:
##  $ Neo.Reference.ID            : int  3703080 3723955 2446862 3092506 3514799 3671135 2495323 2153315 2162463 2306383 ...
##  $ Name                        : int  3703080 3723955 2446862 3092506 3514799 3671135 2495323 2153315 2162463 2306383 ...
##  $ Absolute.Magnitude          : num  21.6 21.3 20.3 27.4 21.6 19.6 19.6 19.2 17.8 21.5 ...
##  $ Est.Dia.in.KM.min.          : num  0.1272 0.1461 0.2315 0.0088 0.1272 ...
##  $ Est.Dia.in.KM.max.          : num  0.2845 0.3266 0.5177 0.0197 0.2845 ...
##  $ Est.Dia.in.M.min.           : num  127.2 146.1 231.5 8.8 127.2 ...
##  $ Est.Dia.in.M.max.           : num  284.5 326.6 517.7 19.7 284.5 ...
##  $ Est.Dia.in.Miles.min.       : num  0.07905 0.09076 0.14385 0.00547 0.07905 ...
##  $ Est.Dia.in.Miles.max.       : num  0.1768 0.203 0.3217 0.0122 0.1768 ...
##  $ Est.Dia.in.Feet.min.        : num  417.4 479.2 759.5 28.9 417.4 ...
##  $ Est.Dia.in.Feet.max.        : num  933.3 1071.6 1698.3 64.6 933.3 ...
##  $ Close.Approach.Date         : chr  "1995-01-01" "1995-01-01" "1995-01-08" "1995-01-15" ...
##  $ Epoch.Date.Close.Approach   : num  7.89e+11 7.89e+11 7.90e+11 7.90e+11 7.90e+11 ...
##  $ Relative.Velocity.km.per.sec: num  6.12 18.11 7.59 11.17 9.84 ...
##  $ Relative.Velocity.km.per.hr : num  22017 65210 27327 40226 35427 ...
##  $ Miles.per.hour              : num  13681 40519 16980 24995 22013 ...
##  $ Miss.Dist..Astronomical.    : num  0.419 0.383 0.051 0.285 0.408 ...
##  $ Miss.Dist..lunar.           : num  163.2 149 19.8 111 158.6 ...
##  $ Miss.Dist..kilometers.      : num  62753692 57298148 7622912 42683616 61010824 ...
##  $ Miss.Dist..miles.           : num  38993336 35603420 4736658 26522368 37910368 ...
##  $ Orbiting.Body               : chr  "Earth" "Earth" "Earth" "Earth" ...
##  $ Orbit.ID                    : int  17 21 22 7 25 40 43 22 100 30 ...
##  $ Orbit.Determination.Date    : chr  "2017-04-06 08:36:37" "2017-04-06 08:32:49" "2017-04-06 09:20:19" "2017-04-06 09:15:49" ...
##  $ Orbit.Uncertainity          : int  5 3 0 6 1 1 1 0 0 0 ...
##  $ Minimum.Orbit.Intersection  : num  0.02528 0.18693 0.04306 0.00551 0.0348 ...
##  $ Jupiter.Tisserand.Invariant : num  4.63 5.46 4.56 5.09 5.15 ...
##  $ Epoch.Osculation            : num  2458000 2458000 2458000 2458000 2458000 ...
##  $ Eccentricity                : num  0.426 0.352 0.348 0.217 0.21 ...
##  $ Semi.Major.Axis             : num  1.41 1.11 1.46 1.26 1.23 ...
##  $ Inclination                 : num  6.03 28.41 4.24 7.91 16.79 ...
##  $ Asc.Node.Longitude          : num  314.4 136.7 259.5 57.2 84.6 ...
##  $ Orbital.Period              : num  610 426 644 514 496 ...
##  $ Perihelion.Distance         : num  0.808 0.718 0.951 0.984 0.968 ...
##  $ Perihelion.Arg              : num  57.3 313.1 248.4 18.7 158.3 ...
##  $ Aphelion.Dist               : num  2.01 1.5 1.97 1.53 1.48 ...
##  $ Perihelion.Time             : num  2458162 2457795 2458120 2457902 2457814 ...
##  $ Mean.Anomaly                : num  264.8 173.7 292.9 68.7 135.1 ...
##  $ Mean.Motion                 : num  0.591 0.845 0.559 0.7 0.726 ...
##  $ Equinox                     : chr  "J2000" "J2000" "J2000" "J2000" ...
##  $ Hazardous                   : chr  "True" "False" "True" "False" ...

1.2. Las siguientes columnas las excluimos ya sea porque no son numéricas o porque son colineales con las columnas que sí son parte de nuestro análisis.

# Columns dropped before the analysis: identifiers, dates, the constant
# Orbiting.Body/Equinox fields, and diameter columns that duplicate
# Est.Dia.in.Miles.* in other units (perfectly collinear).
excluded_columns <- c(
  'Neo.Reference.ID',
  'Name',
  'Close.Approach.Date',
  'Epoch.Date.Close.Approach',
  'Orbit.ID',
  'Orbit.Determination.Date',
  'Orbiting.Body',
  'Est.Dia.in.Feet.min.',
  'Est.Dia.in.Feet.max.',
  'Est.Dia.in.M.min.',
  'Est.Dia.in.M.max.',
  'Est.Dia.in.KM.min.',
  'Est.Dia.in.KM.max.',
  'Equinox',
  'Orbit.Uncertainity'
)

ds_step_1 <- dataset %>% dplyr::select(-excluded_columns) %>% na.omit
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(excluded_columns)` instead of `excluded_columns` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
rm(dataset)

str(ds_step_1)
## 'data.frame':    4687 obs. of  25 variables:
##  $ Absolute.Magnitude          : num  21.6 21.3 20.3 27.4 21.6 19.6 19.6 19.2 17.8 21.5 ...
##  $ Est.Dia.in.Miles.min.       : num  0.07905 0.09076 0.14385 0.00547 0.07905 ...
##  $ Est.Dia.in.Miles.max.       : num  0.1768 0.203 0.3217 0.0122 0.1768 ...
##  $ Relative.Velocity.km.per.sec: num  6.12 18.11 7.59 11.17 9.84 ...
##  $ Relative.Velocity.km.per.hr : num  22017 65210 27327 40226 35427 ...
##  $ Miles.per.hour              : num  13681 40519 16980 24995 22013 ...
##  $ Miss.Dist..Astronomical.    : num  0.419 0.383 0.051 0.285 0.408 ...
##  $ Miss.Dist..lunar.           : num  163.2 149 19.8 111 158.6 ...
##  $ Miss.Dist..kilometers.      : num  62753692 57298148 7622912 42683616 61010824 ...
##  $ Miss.Dist..miles.           : num  38993336 35603420 4736658 26522368 37910368 ...
##  $ Minimum.Orbit.Intersection  : num  0.02528 0.18693 0.04306 0.00551 0.0348 ...
##  $ Jupiter.Tisserand.Invariant : num  4.63 5.46 4.56 5.09 5.15 ...
##  $ Epoch.Osculation            : num  2458000 2458000 2458000 2458000 2458000 ...
##  $ Eccentricity                : num  0.426 0.352 0.348 0.217 0.21 ...
##  $ Semi.Major.Axis             : num  1.41 1.11 1.46 1.26 1.23 ...
##  $ Inclination                 : num  6.03 28.41 4.24 7.91 16.79 ...
##  $ Asc.Node.Longitude          : num  314.4 136.7 259.5 57.2 84.6 ...
##  $ Orbital.Period              : num  610 426 644 514 496 ...
##  $ Perihelion.Distance         : num  0.808 0.718 0.951 0.984 0.968 ...
##  $ Perihelion.Arg              : num  57.3 313.1 248.4 18.7 158.3 ...
##  $ Aphelion.Dist               : num  2.01 1.5 1.97 1.53 1.48 ...
##  $ Perihelion.Time             : num  2458162 2457795 2458120 2457902 2457814 ...
##  $ Mean.Anomaly                : num  264.8 173.7 292.9 68.7 135.1 ...
##  $ Mean.Motion                 : num  0.591 0.845 0.559 0.7 0.726 ...
##  $ Hazardous                   : chr  "True" "False" "True" "False" ...

2. Eliminamos las columnas que están altamente co-relacionadas

Excluimos las columnas que tienen una correlación mayor al 90%. De cada grupo altamente correlacionado nos quedamos con una sola variable, ya que todas son muy similares.

# Flag columns whose pairwise correlation exceeds the cutoff (project
# helper; the verbose trace below resembles caret::findCorrelation output).
high_correlated_columns <- find_high_correlated_columns(
  feat(ds_step_1), 
  cutoff=0.9
)
## Compare row 21  and column  15 with corr  0.975 
##   Means:  0.285 vs 0.213 so flagging column 21 
## Compare row 9  and column  7 with corr  1 
##   Means:  0.287 vs 0.206 so flagging column 9 
## Compare row 7  and column  10 with corr  1 
##   Means:  0.253 vs 0.199 so flagging column 7 
## Compare row 10  and column  8 with corr  1 
##   Means:  0.216 vs 0.196 so flagging column 10 
## Compare row 12  and column  15 with corr  0.93 
##   Means:  0.271 vs 0.191 so flagging column 12 
## Compare row 15  and column  18 with corr  0.995 
##   Means:  0.226 vs 0.184 so flagging column 15 
## Compare row 4  and column  5 with corr  1 
##   Means:  0.29 vs 0.175 so flagging column 4 
## Compare row 5  and column  6 with corr  1 
##   Means:  0.246 vs 0.163 so flagging column 5 
## Compare row 3  and column  2 with corr  1 
##   Means:  0.219 vs 0.153 so flagging column 3 
## Compare row 22  and column  13 with corr  0.978 
##   Means:  0.127 vs 0.151 so flagging column 13 
## All correlations <= 0.9
ds_step_2 <- ds_step_1 %>% dplyr::select(-high_correlated_columns[-1])

rm(ds_step_1)
str(ds_step_2)
## 'data.frame':    4687 obs. of  16 variables:
##  $ Absolute.Magnitude        : num  21.6 21.3 20.3 27.4 21.6 19.6 19.6 19.2 17.8 21.5 ...
##  $ Est.Dia.in.Miles.min.     : num  0.07905 0.09076 0.14385 0.00547 0.07905 ...
##  $ Miles.per.hour            : num  13681 40519 16980 24995 22013 ...
##  $ Miss.Dist..lunar.         : num  163.2 149 19.8 111 158.6 ...
##  $ Minimum.Orbit.Intersection: num  0.02528 0.18693 0.04306 0.00551 0.0348 ...
##  $ Eccentricity              : num  0.426 0.352 0.348 0.217 0.21 ...
##  $ Inclination               : num  6.03 28.41 4.24 7.91 16.79 ...
##  $ Asc.Node.Longitude        : num  314.4 136.7 259.5 57.2 84.6 ...
##  $ Orbital.Period            : num  610 426 644 514 496 ...
##  $ Perihelion.Distance       : num  0.808 0.718 0.951 0.984 0.968 ...
##  $ Perihelion.Arg            : num  57.3 313.1 248.4 18.7 158.3 ...
##  $ Aphelion.Dist             : num  2.01 1.5 1.97 1.53 1.48 ...
##  $ Perihelion.Time           : num  2458162 2457795 2458120 2457902 2457814 ...
##  $ Mean.Anomaly              : num  264.8 173.7 292.9 68.7 135.1 ...
##  $ Mean.Motion               : num  0.591 0.845 0.559 0.7 0.726 ...
##  $ Hazardous                 : chr  "True" "False" "True" "False" ...

3. Tomamos las columnas que mejor separan las clases.

Para realizar este paso vamos a utilizar la función features_importance, basada en el algoritmo Random Forest. Esta función nos permite comparar las variables de acuerdo a cuán buenas son para separar las clases. Luego tomaremos las 5 variables que mejor separan las clases.

result <- features_importance(ds_step_2, target = 'Hazardous')
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(target)` instead of `target` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
result
## 
## Call:
##  randomForest(x = features, y = target, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 0.28%
## Confusion matrix:
##       False True class.error
## False  3926    6 0.001525941
## True      7  748 0.009271523
plot_features_importance(result)

# Number of top features kept for the rest of the pipeline (the 15-feature
# variant was tried and left commented out).
n_best_features = 5
# n_best_features = 15

best_features <- top_acc_features(result, top=n_best_features)
## Selecting by MeanDecreaseGini
best_features
## [1] "Minimum.Orbit.Intersection" "Absolute.Magnitude"        
## [3] "Est.Dia.in.Miles.min."      "Perihelion.Distance"       
## [5] "Inclination"
length(best_features)
## [1] 5
ds_step_3 <- ds_step_2 %>% dplyr::select(c(best_features, c(Hazardous)))
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(best_features)` instead of `best_features` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
rm(ds_step_2)
str(ds_step_3)
## 'data.frame':    4687 obs. of  6 variables:
##  $ Minimum.Orbit.Intersection: num  0.02528 0.18693 0.04306 0.00551 0.0348 ...
##  $ Absolute.Magnitude        : num  21.6 21.3 20.3 27.4 21.6 19.6 19.6 19.2 17.8 21.5 ...
##  $ Est.Dia.in.Miles.min.     : num  0.07905 0.09076 0.14385 0.00547 0.07905 ...
##  $ Perihelion.Distance       : num  0.808 0.718 0.951 0.984 0.968 ...
##  $ Inclination               : num  6.03 28.41 4.24 7.91 16.79 ...
##  $ Hazardous                 : chr  "True" "False" "True" "False" ...

4. Filtramos outliers

# Remove outliers with method 2; method 1 (score-based) kept for reference.
# ds_step_4 <- filter_outliers_m1(ds_step_3, max_score = 0.51)
ds_step_4 <- filter_outliers_m2(ds_step_3)
## [1] "Outliers: 4 %"
## [1] "Dataset without outliers: 96 %"
rm(ds_step_3)

5. Transformamos la variable a predecir a tipo numérico

# Recode the target as numeric: 'True' -> 1, anything else (including NA)
# -> 0, matching the original case_when() mapping exactly.
ds_step_5 <- ds_step_4 %>%
  mutate(Hazardous = as.numeric(Hazardous %in% 'True'))

rm(ds_step_4)
str(ds_step_5)
## 'data.frame':    4493 obs. of  6 variables:
##  $ Minimum.Orbit.Intersection: num  0.02528 0.18693 0.04306 0.00551 0.0348 ...
##  $ Absolute.Magnitude        : num  21.6 21.3 20.3 27.4 21.6 19.6 19.6 19.2 17.8 21.5 ...
##  $ Est.Dia.in.Miles.min.     : num  0.07905 0.09076 0.14385 0.00547 0.07905 ...
##  $ Perihelion.Distance       : num  0.808 0.718 0.951 0.984 0.968 ...
##  $ Inclination               : num  6.03 28.41 4.24 7.91 16.79 ...
##  $ Hazardous                 : num  1 0 1 0 1 0 0 0 0 1 ...

6. Análisis Exploratorio

6.1. Boxplot comparativos

coparative_boxplot(feat(ds_step_5), to_col=n_best_features)

6.2. Histogramas y densidad

comparative_histplot(feat(ds_step_5), to_col=n_best_features)

6.3. Analizamos gráfico de normalidad univariado

## [[1]]
## [1] 3.7e-24
## 
## [[2]]
## [1] 3.7e-24
## 
## [[3]]
## [1] 3.7e-24
## 
## [[4]]
## [1] 3.7e-24
## 
## [[5]]
## [1] 3.7e-24

Observaciones: Al parecer solo Perihelion.Distance parece ser normal o, por lo menos, la más normal de todas.

6.3. Test de normalidad uni-variado

uni_shapiro_test(feat(ds_step_5))
## [1] "=> Variable: Minimum.Orbit.Intersection"
## [1] ""
## 
##  Shapiro-Wilk normality test
## 
## data:  features[, col_index]
## W = 0.83111, p-value < 2.2e-16
## 
## [1] "=> Variable: Absolute.Magnitude"
## [1] ""
## 
##  Shapiro-Wilk normality test
## 
## data:  features[, col_index]
## W = 0.9821, p-value < 2.2e-16
## 
## [1] "=> Variable: Est.Dia.in.Miles.min."
## [1] ""
## 
##  Shapiro-Wilk normality test
## 
## data:  features[, col_index]
## W = 0.75483, p-value < 2.2e-16
## 
## [1] "=> Variable: Perihelion.Distance"
## [1] ""
## 
##  Shapiro-Wilk normality test
## 
## data:  features[, col_index]
## W = 0.98592, p-value < 2.2e-16
## 
## [1] "=> Variable: Inclination"
## [1] ""
## 
##  Shapiro-Wilk normality test
## 
## data:  features[, col_index]
## W = 0.91502, p-value < 2.2e-16

Observaciones: En todos los casos el p-valor < 0.05 y se rechaza normalidad en todas las variables. Esto coincide con los qqplot, donde en todos los casos no son normales salvo Perihelion.Distance, que parece tender a la normalidad.

6.4. Test de normalidad muti-variado

mult_shapiro_test(feat(ds_step_5))
## 
##  Shapiro-Wilk normality test
## 
## data:  Z
## W = 0.71502, p-value < 2.2e-16

Observaciones: El p-valor < 0.05, por lo tanto se rechaza normalidad multi-variada. Se corresponde con el resultado de los tests de Shapiro uni-variados y los qqplots.

6.5. Test de homocedasticidad multi-variado

multi_boxm_test(feat(ds_step_5), ds_step_5$Hazardous)
## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  features
## Chi-Sq (approx.) = 3242.4, df = 15, p-value < 2.2e-16

Observaciones: El p-valor < 0.05 por lo tanto se rechaza la hipótesis nula y podemos decir que las variables no son homocedasticas.

6.6. Correlaciones entre variables

plot_correlations(feat(ds_step_5))

6.7. Análisis completo

6.8. PCA: Comparación de variables originales con/sin la variable a predecir.

## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 4493 individuals, described by 6 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"

## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 4493 individuals, described by 5 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"

6.9. PCA: Con observaciones discriminadas por clase. Primero quitamos

outliers para poder ver con mas claridad el biplot.

plot_robust_pca(ds_step_5)

7. Train test split

Partimos el dataset en los conjuntos de entrenamiento y prueba. Además, antes de partirlo, ordenamos las observaciones aleatoriamente para que los modelos de clasificación no aprendan ninguna secuencia, si es que existe alguna en el orden original.

# 80/20 train/test split; %<-% (zeallot) destructures the returned pair.
# shuffle = TRUE randomizes row order so models cannot learn any sequence.
c(raw_train_set, raw_test_set) %<-% train_test_split(
  ds_step_5, 
  train_size = .8, 
  shuffle    = TRUE
)
## [1] "Train Set size: 3594"
## [1] "Test set size: 899"

8. Método SMOTE para balancear el dataset.

# SMOTE balancing is currently DISABLED: the raw (imbalanced) training set
# is used as-is. Re-enable the commented lines to oversample the minority
# class.
# balanced_train_set <- smote_balance(raw_train_set, raw_train_set$Hazardous)
#rm(raw_train_set)
balanced_train_set <- raw_train_set

9. Escalamos las variables numéricas

Restamos la media y dividimos por el desvío.

# Standardize the numeric features (z-score: subtract mean, divide by sd).
# NOTE(review): the original scaled the TEST set with its own statistics.
# The test set must be transformed with the TRAINING-set mean/sd so both
# sets live in the same feature space and no test information leaks into
# the preprocessing.
feature_cols <- setdiff(names(balanced_train_set), 'Hazardous')
train_means  <- vapply(balanced_train_set[feature_cols], mean, numeric(1))
train_sds    <- vapply(balanced_train_set[feature_cols], sd,   numeric(1))

train_set <- balanced_train_set
train_set[feature_cols] <- scale(
  balanced_train_set[feature_cols], center = train_means, scale = train_sds
)

test_set <- raw_test_set
test_set[feature_cols] <- scale(
  raw_test_set[feature_cols], center = train_means, scale = train_sds
)

rm(balanced_train_set)
rm(raw_test_set)

10. Clasificacion

10.1. Entrenamos un modelo LDA

# Linear Discriminant Analysis on the scaled training set.
lda_model       <- lda(formula(Hazardous~.), train_set)

# Predict on both sets; $class (used below) holds the predicted labels.
lda_test_pred   <- predict(lda_model, feat(test_set))
lda_train_pred  <- predict(lda_model, feat(train_set))
plot_cm(lda_test_pred$class, target(test_set))

plot_roc(lda_test_pred$class, target(test_set))

##                                  
##  Method used: empirical          
##  ===== Positive(s) =====         
##  Number of positive(s): 149      
##  Mean of positive(s): 1.691      
##  Variance of positive(s): 0.2149 
##  ===== Negative(s) =====         
##  Number of negative(s): 750      
##  Mean of negative(s): 1.029      
##  Variance of negative(s): 0.02851
##  ===== AUC =====                 
##  Area under curve: 0.831         
##                                  
##  =========================       
##      FPR    TPR
##  0.00000 0.0000
##  0.02933 0.6913
##  1.00000 1.0000
fbeta_score(prediction=lda_test_pred$class, reality=target(test_set), beta=2)
## [1] " F2Score: 0.714285714285714"
lda_model$scaling
##                                    LD1
## Minimum.Orbit.Intersection -1.25901613
## Absolute.Magnitude         -1.61036721
## Est.Dia.in.Miles.min.      -0.32194894
## Perihelion.Distance         0.09444703
## Inclination                -0.04451865

10.2. Entrenamos un modelo QDA

# Quadratic Discriminant Analysis (one covariance matrix per class),
# consistent with Box's M rejecting homoscedasticity earlier.
qda_model      <- qda(formula(Hazardous~.), train_set)

qda_test_pred  <- predict(qda_model, feat(test_set))
qda_train_pred <- predict(qda_model, feat(train_set))
plot_cm(qda_test_pred$class, target(test_set))

plot_roc(qda_test_pred$class, target(test_set))

##                                  
##  Method used: empirical          
##  ===== Positive(s) =====         
##  Number of positive(s): 149      
##  Mean of positive(s): 1.94       
##  Variance of positive(s): 0.05714
##  ===== Negative(s) =====         
##  Number of negative(s): 750      
##  Mean of negative(s): 1.019      
##  Variance of negative(s): 0.01834
##  ===== AUC =====                 
##  Area under curve: 0.9605        
##                                  
##  =========================       
##      FPR    TPR
##  0.00000 0.0000
##  0.01867 0.9396
##  1.00000 1.0000
fbeta_score(prediction=qda_test_pred$class, reality=target(test_set), beta=2)
## [1] " F2Score: 0.933333333333333"

10.3. Entrenamos un modelo RDA

# Regularized Discriminant Analysis. Note: unlike lda/qda above, predict()
# here is fed the full data frame rather than feat(...).
rda_model      <- rda(formula(Hazardous~.), train_set)

rda_test_pred  <- predict(rda_model, test_set)
rda_train_pred <- predict(rda_model, train_set)
plot_cm(rda_test_pred$class, target(test_set))

plot_roc(rda_test_pred$class, target(test_set))

##                                  
##  Method used: empirical          
##  ===== Positive(s) =====         
##  Number of positive(s): 149      
##  Mean of positive(s): 1.94       
##  Variance of positive(s): 0.05714
##  ===== Negative(s) =====         
##  Number of negative(s): 750      
##  Mean of negative(s): 1.019      
##  Variance of negative(s): 0.01834
##  ===== AUC =====                 
##  Area under curve: 0.9605        
##                                  
##  =========================       
##      FPR    TPR
##  0.00000 0.0000
##  0.01867 0.9396
##  1.00000 1.0000
fbeta_score(rda_test_pred$class, target(test_set), beta=2)
## [1] " F2Score: 0.933333333333333"

10.4. Entrenamos un modelo de regresión logística

# Logistic regression (binomial GLM).
lr_model <- glm(formula(Hazardous~.), train_set, family=binomial)

# NOTE(review): predict.glm() defaults to type = 'link', so the predictions
# below are on the LOG-ODDS scale; a threshold of 0.01 on that scale is
# equivalent to a probability of ~0.502, not 0.01. If a probability
# threshold is intended, use predict(..., type = 'response') — confirm.
lr_threshold <- 0.01

lr_test_pred <- predict(lr_model, test_set)
lr_test_pred_threshold <- ifelse(lr_test_pred >= lr_threshold, 1, 0)

lr_train_pred <- predict(lr_model, train_set)
lr_train_pred_threshold <- ifelse(lr_train_pred >= lr_threshold, 1, 0)
plot_cm(lr_test_pred_threshold, target(test_set))

plot_roc(lr_test_pred_threshold, target(test_set))

##                                  
##  Method used: empirical          
##  ===== Positive(s) =====         
##  Number of positive(s): 149      
##  Mean of positive(s): 0.8322     
##  Variance of positive(s): 0.1406 
##  ===== Negative(s) =====         
##  Number of negative(s): 750      
##  Mean of negative(s): 0.02       
##  Variance of negative(s): 0.01963
##  ===== AUC =====                 
##  Area under curve: 0.9061        
##                                  
##  =========================       
##   FPR    TPR
##  0.00 0.0000
##  0.02 0.8322
##  1.00 1.0000
fbeta_score(lr_test_pred_threshold, target(test_set), beta=2)
## [1] " F2Score: 0.843537414965986"

10.5. Entrenamos un modelo SVM

# SVM with an RBF kernel. Hazardous is numeric (0/1) at this point, so
# e1071's svm() fits in regression mode and predict() returns a continuous
# score that is thresholded below.
svm_model <- svm(formula(Hazardous~.), train_set, kernel = 'radial')

svm_threshold <- 0.20

svm_test_pred <- predict(svm_model, test_set)
svm_test_pred_threshold <- ifelse(svm_test_pred >= svm_threshold, 1, 0)

svm_train_pred <- predict(svm_model, train_set)
svm_train_pred_threshold <- ifelse(svm_train_pred >= svm_threshold, 1, 0)
plot_cm(svm_test_pred_threshold, target(test_set))

plot_roc(svm_test_pred_threshold, target(test_set))

##                                  
##  Method used: empirical          
##  ===== Positive(s) =====         
##  Number of positive(s): 149      
##  Mean of positive(s): 0.9866     
##  Variance of positive(s): 0.01333
##  ===== Negative(s) =====         
##  Number of negative(s): 750      
##  Mean of negative(s): 0.06933    
##  Variance of negative(s): 0.06461
##  ===== AUC =====                 
##  Area under curve: 0.9586        
##                                  
##  =========================       
##      FPR    TPR
##  0.00000 0.0000
##  0.06933 0.9866
##  1.00000 1.0000
fbeta_score(svm_test_pred_threshold, target(test_set), beta=2)
## [1] " F2Score: 0.924528301886793"

10.6. Entrenamos un modelo XGBoost

# Gradient-boosted trees; xgboost requires a plain numeric matrix and a
# 0/1 numeric target.
# NOTE(review): nround = 15000 with max_depth = 20 is an enormous model for
# ~3.6k training rows — consider early stopping with a validation watchlist.
xgboost_model <- xgboost(
 as.matrix(feat(train_set)), 
 target(train_set),
 eta         = 0.2,
 max_depth   = 20,
 nround      = 15000,
 eval_metric = 'logloss',
 objective   = "binary:logistic",
 nthread     = 24,
 verbose     = 0
)
# Probability threshold for the positive class.
xgb_threshold <- 0.1

xgb_test_pred <- predict(xgboost_model, as.matrix(feat(test_set)))
xgb_test_pred_threshold <- ifelse(xgb_test_pred >= xgb_threshold, 1, 0)

xgb_train_pred <- predict(xgboost_model, as.matrix(feat(train_set)))
xgb_train_pred_threshold <- ifelse(xgb_train_pred >= xgb_threshold, 1, 0)

plot_cm(xgb_test_pred_threshold, target(test_set))

plot_roc(xgb_test_pred_threshold, target(test_set))

##                                   
##  Method used: empirical           
##  ===== Positive(s) =====          
##  Number of positive(s): 149       
##  Mean of positive(s): 0.9732      
##  Variance of positive(s): 0.0263  
##  ===== Negative(s) =====          
##  Number of negative(s): 750       
##  Mean of negative(s): 0.009333    
##  Variance of negative(s): 0.009259
##  ===== AUC =====                  
##  Area under curve: 0.9819         
##                                   
##  =========================        
##       FPR    TPR
##  0.000000 0.0000
##  0.009333 0.9732
##  1.000000 1.0000
fbeta_score(xgb_test_pred_threshold, target(test_set), beta=2)
## [1] " F2Score: 0.969251336898396"

10.7. Comparativa de metricas

# Side-by-side F2 (beta = 2, recall-weighted) comparison of every model on
# the test and train sets, sorted by test score. A large
# Diff.F2Score.Percent (|test - train|) flags over-fitting.
# NOTE(review): the original left Diff on the 0-1 scale while the other
# *.Percent columns were multiplied by 100; Diff is now scaled by 100 too
# so all three columns share the same units.
data.frame(
  Test.F2Score.Percent = c(
    fbeta_score(lda_test_pred$class,     target(test_set), beta=2, show=FALSE),
    fbeta_score(qda_test_pred$class,     target(test_set), beta=2, show=FALSE),
    fbeta_score(rda_test_pred$class,     target(test_set), beta=2, show=FALSE),
    fbeta_score(lr_test_pred_threshold,  target(test_set), beta=2, show=FALSE),
    fbeta_score(svm_test_pred_threshold, target(test_set), beta=2, show=FALSE),
    fbeta_score(xgb_test_pred_threshold, target(test_set), beta=2, show=FALSE)
  ),
  Train.F2Score.Percent = c(
    fbeta_score(lda_train_pred$class,     target(train_set), beta=2, show=FALSE),
    fbeta_score(qda_train_pred$class,     target(train_set), beta=2, show=FALSE),
    fbeta_score(rda_train_pred$class,     target(train_set), beta=2, show=FALSE),
    fbeta_score(lr_train_pred_threshold,  target(train_set), beta=2, show=FALSE),
    fbeta_score(svm_train_pred_threshold, target(train_set), beta=2, show=FALSE),
    fbeta_score(xgb_train_pred_threshold, target(train_set), beta=2, show=FALSE)
  ),
  Model = c(
    'LDA', 
    'QDA', 
    'RDA', 
    'Regresion Logistica',
    'SVM',
    'XGBoost'
  ),
  # Decision threshold used by each model (0.5 is the implicit LDA/QDA/RDA
  # class rule).
  Umbral = c(
    0.5, 
    0.5, 
    0.5, 
    lr_threshold,
    svm_threshold,
    xgb_threshold
  )
) %>% 
  arrange(desc(Test.F2Score.Percent)) %>%
  mutate(Diff.F2Score.Percent = abs(Test.F2Score.Percent - Train.F2Score.Percent)) %>%
  mutate(
    Test.F2Score.Percent  = round(Test.F2Score.Percent * 100, 3),
    Train.F2Score.Percent = round(Train.F2Score.Percent * 100, 3),
    Diff.F2Score.Percent  = round(Diff.F2Score.Percent * 100, 3)
  ) %>%
  dplyr::select(
    Test.F2Score.Percent,
    Train.F2Score.Percent,
    Diff.F2Score.Percent,
    Model,
    Umbral
  )

11. KMeans Clustering

11.1. Primero definimos cuantos grupos utilizar.

Típica con estimadores de la normal.

# z-score every feature before computing the clustering metrics.
# NOTE(review): the original called mutate(~(scale(.) %>% as.vector)) —
# an unnamed lambda passed straight to mutate(), which does not apply the
# scaling across the columns; across(everything(), ...) does.
scaled_data_1 <- feat(ds_step_5) %>%
  mutate(across(everything(), ~ as.vector(scale(.x))))
clustering_metrics_plot(scaled_data_1)

Escalamiento diferente de la típica normal.

# Alternative scaling: map each feature column onto the [0, 1] range
# (min-max normalization).
rescale_01 <- function(col) (col - min(col)) / (max(col) - min(col))

scaled_data_2 <- apply(feat(ds_step_5), 2, rescale_01)
clustering_metrics_plot(scaled_data_2)

11.2. Kmeans

# Fit k-means with k = 2 on the z-scored features and attach the cluster
# label to a copy of the dataset.
n_clusters <- 2
kmeans_model <- kmeans(scaled_data_1, n_clusters)

km_ds_step_5 <- data.frame(ds_step_5)
km_ds_step_5$kmeans <- kmeans_model$cluster

10.2. biplot

# ds_without_outliers <- filter_outliers_m1(km_ds_step_5, max_score=0.5)
ds_without_outliers <- filter_outliers_m2(km_ds_step_5)
## [1] "Outliers: 5 %"
## [1] "Dataset without outliers: 95 %"
# Biplot of the k-means groups on the robust PCA projection.
# (Removed the trailing comma after `groups = ...`, which left an empty
# argument in the call.)
plot_robust_pca(
  ds_without_outliers %>% dplyr::select(-kmeans),
  groups = factor(ds_without_outliers$kmeans)
)

11.3 Hierarchical Clustering

Matriz de distancias euclídeas

mat_dist <- dist(x = ds_step_5, method = "euclidean") 

Dendrogramas (según el tipo de segmentación jerárquica aplicada)

# Agglomerative clustering with four linkage strategies; the cophenetic
# correlations computed below select the linkage that best preserves the
# original distances.
hc_complete <- hclust(d = mat_dist, method = "complete") 
hc_average  <- hclust(d = mat_dist, method = "average")
hc_single   <- hclust(d = mat_dist, method = "single")
hc_ward     <- hclust(d = mat_dist, method = "ward.D2")

Calculo del coeficiente de correlación cofenetico

cor(x = mat_dist, cophenetic(hc_complete))
## [1] 0.7394402
cor(x = mat_dist, cophenetic(hc_average))
## [1] 0.7706933
cor(x = mat_dist, cophenetic(hc_single))
## [1] 0.4175976
cor(x = mat_dist, cophenetic(hc_ward))
## [1] 0.7243304

Construcción de un dendograma usando los resultados de la técnica de Ward

# Cut the Ward dendrogram into 2 groups, highlight them on the plot and
# attach the group label to a copy of the dataset.
n_clusters <- 2

graphics.off()
plot(hc_ward )
rect.hclust(hc_ward, k=n_clusters, border="red")
hc_ds <- data.frame(ds_step_5)
hc_ds$her_ward <- cutree(hc_ward, k=n_clusters)
# Score-based alternative kept for reference:
# ds_without_outliers <- filter_outliers_m1(hc_ds, max_score=0.57)
ds_without_outliers <- filter_outliers_m2(hc_ds)
## [1] "Outliers: 5 %"
## [1] "Dataset without outliers: 95 %"
# Biplot of the hierarchical groups on the robust PCA projection.
plot_robust_pca(
  ds_without_outliers %>% dplyr::select(-her_ward),
  groups = factor(ds_without_outliers$her_ward),
  colours=c("orange","cyan","blue","magenta","yellow","black"),
  labels=c("grupo 1", "grupo 2","grupo 3","grupo 4","grupo 5","grupo 6")
)

 

Realizado por Adrian Marino

adrianmarino@gmail.com